001    /*
002     * PairwiseAlignmentAlgorithm.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    import java.io.Reader;
035    import java.io.IOException;
036    
037    /**
038     * This abstract class is the superclass of all classes implementing pairwise sequence
039     * alignment algorithms. Subclasses are required to provide methods to build a high
040     * scoring alignment between two sequences and compute its score with a given scoring
041     * scheme.
042     *
043     * <P>Clients are required to set a scoring scheme and load two sequences before
044     * requesting an alignment or the computation of its score. They typically make the
045     * following sequence of method calls:</P>
046     *
047     * <CODE><BLOCKQUOTE><PRE>
048     * // prepare
049     * PairwiseAlignmentAlgorithm algorithm = new SomePairwiseAlignmentAlgorith ();
050     * algorithm.setScoringScheme (some_scoring_scheme);
051     * algorithm.loadSequences (sequence1, sequence2);
052     *
053     * // now compute the alignment
054     * PairwiseAlignment alignment = algorithm.getPairwiseAlignment();
055     * int score = algorithm.getScore();
056     * </PRE></BLOCKQUOTE></CODE>
057     *
058     * @author Sergio A. de Carvalho Jr.
059     * @see PairwiseAlignment
060     */
061    public abstract class PairwiseAlignmentAlgorithm
062    {
063            /**
064             * Tag character that signals a match in the score tag line of an alignment. Its use
065             * is conditioned by the <CODE>use_match_tag</CODE> flag.
066             *
067             * @see #use_match_tag
068             * @see #useMatchTag
069             */
070            protected static final char MATCH_TAG = '|';
071    
072            /**
073             * Tag character that signals an approximate match in the score tag line of an
074             * alignment.
075             */
076            protected static final char APPROXIMATE_MATCH_TAG = '+';
077    
078            /**
079             * Character that signals a mismatch in the score tag line of an alignment.
080             */
081            protected static final char MISMATCH_TAG = ' ';
082    
083            /**
084             * Character that signals a gap in the score tag line of an alignment.
085             */
086            protected static final char GAP_TAG = ' ';
087    
088            /**
089             * Character that signals a gap in sequence.
090             */
091            protected static final char GAP_CHARACTER = '-';
092    
093            /**
094             * Indicates if the <CODE>MATCH_TAG</CODE> tag should be used or not. If it is
095             * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE>
096             * tag in the score tag line of the alignment whenever a match occurs between
097             * characters of the two sequences. If it is <CODE>false</CODE> the matching character
098             * should be written instead. This flag is updated whenever a scoring scheme is set to
099             * this <CODE>PairwiseAlignmentAlgorithm</CODE> by the <CODE>setScoringScheme</CODE>
100             * method.
101             *
102             * @see #MATCH_TAG
103             * @see #useMatchTag
104             * @see #setScoringScheme
105             */
106            protected boolean use_match_tag;
107    
108            /**
109             * The scoring scheme used to compute a pairwise sequence alignment. It must be set
110             * before performing the alignment, and if a new scoring scheme is set, any alignment
111             * or score already computed is lost.
112             */
113            protected ScoringScheme scoring;
114    
115            /**
116             * Stores the product of the last pairwise alignment performed. It contains a string
117             * representation of a highest scoring alignment between the two sequences and its
118             * score. It is set after a successful execution of the
119             * <CODE>computePairwiseAlignment</CODE> method that subclasses must implement. It is
120             * set to null if new sequences are loaded or a new scoring scheme is set.
121             */
122            protected PairwiseAlignment alignment;
123    
124            /**
125             * This field stores just the score of the last pairwise alignment performed (if the
126             * <CODE>score_computed flag</CODE> is set to true). It is useful when just the score
127             * is needed (and not the alignment itselft). Its value is set after a successful
128             * execution of both <CODE>computePairwiseAlignment</CODE> or
129             * <CODE>computeScore</CODE> methods that subclasses must implement. If new sequences
130             * are loaded or a new scoring scheme is set, the <CODE>score_computed</CODE> flag is
131             * set to false, and this field's value becomes undefined.
132             */
133            protected int score;
134    
135            /**
136             * Flags whether the score of the alignment between the last two loaded sequences has
137             * already been computed. It is set to true after a successful execution of both
138             * <CODE>computePairwiseAlignment</CODE> or <CODE>computeScore</CODE> methods that
139             * subclasses must implement. It is set to falsef if new sequences are loaded or a new
140             * scoring scheme is set.
141             */
142            protected boolean score_computed = false;
143    
144            /**
145             * Flags whether sequences have been loaded. It is set to true after subclasses
146             * successfully load two sequences.
147             */
148            protected boolean sequences_loaded = false;
149    
150            /**
151             * Sets the scoring scheme to be used for the next alignments. Any alignment or score
152             * already computed is lost. If the scoring scheme supports partial matches, this
153             * <CODE>PairwiseAlignmentAlgorithm</CODE> is set not to use the
154             * <CODE>MATCH_TAG</CODE> tag because in this case the score tag line be confusing.
155             * If the scoring scheme does not support partial matches, then the use of the
156             * <CODE>MATCH_TAG</CODE> tag is enabled.
157             *
158             * @param scoring Scoring scheme to be used
159             * @see #MATCH_TAG
160             * @see ScoringScheme#isPartialMatchSupported
161             */
162            public void setScoringScheme (ScoringScheme scoring)
163            {
164                    if (scoring == null)
165                            throw new IllegalArgumentException ("Null scoring scheme object.");
166    
167                    this.scoring = scoring;
168    
169                    // if the scoring scheme supports partial matches,
170                    // disable the use of the MATCH_TAG character
171                    if (scoring.isPartialMatchSupported())
172                            this.use_match_tag = false;
173                    else
174                            this.use_match_tag = true;
175    
176                    // when a new scoring scheme is set,
177                    // the alignment needs to be recomputed
178                    this.alignment = null;
179                    this.score_computed = false;
180            }
181    
182            /**
183             * Tells wether the <CODE>MATCH_TAG</CODE> tag should be used or not. If it returns
184             * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE>
185             * tag in the score tag line of the alignment produced whenever a match occurs between
186             * characters of the two sequences. If it returns <CODE>false</CODE> the matching
187             * character should be written instead. The value returned is conditioned by the
188             * <CODE>use_match_tag</CODE> flag, which is updated whenever a scoring scheme is set
189             * to this <CODE>PairwiseAlignmentAlgorithm</CODE> by the
190             * <CODE>setScoringScheme</CODE> method.
191             *
192             * @return <CODE>true</CODE if the <CODE>MATCH_TAG</CODE> tag should be used,
193             * <CODE>false</CODE> otherwise
194             * @see #MATCH_TAG
195             * @see #use_match_tag
196             * @see #setScoringScheme
197             */
198            protected boolean useMatchTag ()
199            {
200                    return use_match_tag;
201            }
202    
203            /**
204             * Request subclasses to load the sequences according to their own needs. Any
205             * alignment and score already computed is lost. If no exception is raised, the loaded
206             * flag is set to true. Subclasses typically store the sequences in instances of an
207             * appropiate class and each can have its own contract, so check each algorithm to see
208             * what kind of sequences it produces. Input can come from any source provided they
209             * are encapsulated with a proper Reader. They must be ready to be read, i.e. the next
210             * read operation must return the sequence's first character.
211             *
212             * @param input1 First sequence
213             * @param input2 Second sequence
214             * @throws IOException If an I/O error occurs when reading the sequences
215             * @throws InvalidSequenceException If the sequences are not valid
216             */
217            public void loadSequences (Reader input1, Reader input2)
218                    throws IOException, InvalidSequenceException
219            {
220                    // when new sequences are loaded, the
221                    // alignment and score needs to be recomputed
222                    this.alignment = null;
223                    this.score_computed = false;
224    
225                    // make sure that if an exception is raised
226                    // the sequences_loaded flag is false
227                    this.sequences_loaded = false;
228    
229                    // request subclasses to load sequences
230                    loadSequencesInternal (input1, input2);
231    
232                    // if no exception is raised,
233                    // set the loaded flag to true
234                    this.sequences_loaded = true;
235            }
236    
237            /**
238             * Frees pointer to loaded sequences and computed alignments (if any) so that their
239             * data can be garbage collected.
240             */
241            public void unloadSequences ()
242            {
243                    // allow any alignment already computed
244                    // to be garbage collected
245                    this.alignment = null;
246                    this.score_computed = false;
247    
248                    // request subclasses to unload sequences
249                    unloadSequencesInternal ();
250    
251                    this.sequences_loaded = false;
252            }
253    
254            /**
255             * Return the last pairwise alignment computed (if any) or request subclasses to
256             * compute one and return the result by calling the
257             * <CODE>computePairwiseAlignment</CODE> method. The sequences must already be loaded
258             * and a scoring scheme must already be set.
259             *
260             * @return a pairwise alignment between the loaded sequences
261             * @throws IncompatibleScoringSchemeException If the scoring scheme
262             * is not compatible with the loaded sequences
263             * @see #computePairwiseAlignment
264             */
265            public PairwiseAlignment getPairwiseAlignment ()
266                    throws IncompatibleScoringSchemeException
267            {
268                    if (!sequences_loaded)
269                            throw new IllegalStateException ("Sequences have not been loaded.");
270    
271                    if (scoring == null)
272                            throw new IllegalStateException ("Scoring scheme has not been set.");
273    
274                    if (this.alignment == null)
275                    {
276                            // make sure the scoring scheme won't be changed
277                            // in the middle of the alignment computation
278                            synchronized (scoring)
279                            {
280                                    // compute the alignment if it hasn't been computed yet
281                                    this.alignment = computePairwiseAlignment();
282                            }
283    
284                            // store the score as well
285                            this.score = this.alignment.getScore();
286                            this.score_computed = true;
287                    }
288    
289                    return this.alignment;
290            }
291    
292            /**
293             * Returns the score of the last alignment computed (if any) or request subclasses to
294             * compute one and return the result by calling the <CODE>computeScore</CODE> method.
295             * The sequences must already be loaded and a scoring scheme must already be set.
296             *
297             * @return the score of the alignment between the loaded sequences
298             * @throws IncompatibleScoringSchemeException If the scoring scheme
299             * is not compatible with the loaded sequences
300             * @see #computeScore
301             */
302            public int getScore () throws IncompatibleScoringSchemeException
303            {
304                    if (!sequences_loaded)
305                            throw new IllegalStateException ("Sequences have not been loaded.");
306    
307                    if (scoring == null)
308                            throw new IllegalStateException ("Scoring scheme has not been set.");
309    
310                    if (!score_computed)
311                    {
312                            // make sure the scoring scheme won't be changed
313                            // in the middle of the alignment computation
314                            synchronized (scoring)
315                            {
316                                    // compute the alignment's score if it hasn't been computed yet
317                                    this.score = computeScore();
318                            }
319    
320                            this.score_computed = true;
321                    }
322    
323                    return this.score;
324            }
325    
326            /**
327             * Subclasses must implement this method to load sequences according to their own
328             * needs and throw an exception in case of any failure. If no exception is raised, the
329             * loaded flag is set to true by the public method and the sequences are believed to
330             * be loaded (so an alignment or score can be requested).
331             *
332             * @param input1 First sequence
333             * @param input2 Second sequence
334             * @throws IOException If an I/O error occurs when reading the sequences
335             * @throws InvalidSequenceException If the sequences are not valid
336             * @see #loadSequences
337             * @see CharSequence
338             * @see FactorSequence
339             */
340            protected abstract void loadSequencesInternal (Reader input1, Reader input2)
341                    throws IOException, InvalidSequenceException;
342    
343            /**
344             * Subclasses must implement this method to unload sequences according to their own
345             * storage, freeing pointers to sequences and any intermediate data so that they can
346             * be garbage collected. This methid is called by the public
347             * <CODE>unloadSequences</CODE> method.
348             *
349             * @see #unloadSequences
350             */
351            protected abstract void unloadSequencesInternal ();
352    
353            /**
354             * Subclasses must implement this method to compute an alignment between the loaded
355             * sequences using the scoring scheme previously set. This methid is called by the
356             * <CODE>getPairwiseAlignment</CODE> method when needed.
357             *
358             * @return a pairwise alignment between the loaded sequences
359             * @throws IncompatibleScoringSchemeException If the scoring scheme
360             * is not compatible with the loaded sequences
361             * @see #getPairwiseAlignment
362             */
363            protected abstract PairwiseAlignment computePairwiseAlignment ()
364                    throws IncompatibleScoringSchemeException;
365    
366            /**
367             * Subclasses must implement this method to compute the score of the alignment between
368             * the loaded sequences using the scoring scheme previously set. This methid is called
369             * by the <CODE>getScore</CODE> method when needed.
370             *
371             * @return the score of the alignment between the loaded sequences
372             * @throws IncompatibleScoringSchemeException If the scoring scheme
373             * is not compatible with the loaded sequences
374             * @see #getScore
375             */
376            protected abstract int computeScore () throws IncompatibleScoringSchemeException;
377    
378            /**
379             * Helper method to invoke the <CODE>scoreSubstitution</CODE> method of the scoring
380             * scheme set to this algorithm.
381             *
382             * @param a first character
383             * @param b second character
384             * @return score of substitution of <CODE>a</CODE> for <CODE>b</CODE>
385             * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
386             * with the sequences being aligned
387             * @see ScoringScheme#scoreSubstitution
388             */
389            protected final int scoreSubstitution (char a, char b)
390                    throws IncompatibleScoringSchemeException
391            {
392                    return scoring.scoreSubstitution (a, b);
393            }
394    
395            /**
396             * Helper method to invoke the <CODE>scoreInsertion</CODE> method of the scoring
397             * scheme set to this algorithm.
398             *
399             * @param a the character to be inserted
400             * @return score of insertion of <CODE>a</CODE>
401             * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
402             * with the sequences being aligned
403             * @see ScoringScheme#scoreInsertion
404             */
405            protected final int scoreInsertion (char a) throws IncompatibleScoringSchemeException
406            {
407                    return scoring.scoreInsertion (a);
408            }
409    
410            /**
411             * Helper method to invoke the <CODE>scoreDeletion</CODE> method of the scoring scheme
412             * set to this algorithm.
413             *
414             * @param a the character to be deleted
415             * @return score of deletion of <CODE>a</CODE>
416             * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
417             * with the sequences being aligned
418             * @see ScoringScheme#scoreDeletion
419             */
420            protected final int scoreDeletion (char a) throws IncompatibleScoringSchemeException
421            {
422                    return scoring.scoreDeletion (a);
423            }
424    
425            /**
426             * Helper method to compute the the greater of two values.
427             *
428             * @param v1 first value
429             * @param v2 second value
430             * @return the larger of <CODE>v1</CODE> and <CODE>v2</CODE>
431             */
432            protected final int max (int v1, int v2)
433            {
434                    return (v1 >= v2) ? v1 : v2;
435            }
436    
437            /**
438             * Helper method to compute the the greater of three values.
439             *
440             * @param v1 first value
441             * @param v2 second value
442             * @param v3 third value
443             * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> and <CODE>v3</CODE>
444             */
445            protected final int max (int v1, int v2, int v3)
446            {
447                    return (v1 >= v2) ? ((v1 >= v3)? v1 : v3) : ((v2 >= v3)? v2 : v3);
448            }
449    
450            /**
451             * Helper method to compute the the greater of four values.
452             *
453             * @param v1 first value
454             * @param v2 second value
455             * @param v3 third value
456             * @param v4 fourth value
457             * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> <CODE>v3</CODE> and
458             * <CODE>v4</CODE>
459             */
460            protected final int max (int v1, int v2, int v3, int v4)
461            {
462                    int m1 = ((v1 >= v2) ? v1 : v2);
463                    int m2 = ((v3 >= v4) ? v3 : v4);
464    
465                    return (m1 >= m2) ? m1 : m2;
466            }
467    }